rm(list=ls())
knitr::opts_knit$set(root.dir = '~/OneDrive - University College London/Projects/Ostridge_PanAf/baypass/analysing_baypass_output/baypass_core/')
subsps=c('c', 'e', 'ce')
over_write=FALSE
tails=c(0.005, 0.001, 0.0005)
flanks=1000
Here I extract candidates from running the BayPass core model for all subspecies datasets.
BayPass output files we formatted with ./format_baypass_core_output.Rmd where SNP coordinates, gene annotations, coverage data and results from multiple independent runs were added. Statistics with no suffix represent results from the focal run (e.g. ‘XtXst’), and statistics from the the two repeat runs have the suffixes ‘.r1’ and ‘.r2’ (e.g. ‘XtXst.r1’ and ‘XtXst.r2’). Median values have also been calculated.
library(data.table)
options(datatable.fread.datatable=FALSE)
library(dplyr)
library(ggplot2)
library(ggVennDiagram)
library(psych)
library(tidyverse)
library(cowplot)
library(reshape)
library(ggpattern)
library(viridis)
# Call scripts containing custom functions
source("../scripts/baypass_tools.R")
source("scripts/baypass_core_tools.R")
source("scripts/candidate_allele_frequency_patterns.R")
source("../baypass_aux/scripts/baypass_aux_tools.R")
# In
formatted_core_out_dir="output/formatted_baypass_core_output/"
exome_baypass_out_dir="../../running_baypass/exome/output/"
chr21_baypass_out_dir="../../running_baypass/chr21/output/"
# Out
formatted_core_out_fprs_dir="output/baypass_core_output_with_fprs/"
gowinda_snp_file_dir="../../../gowinda/baypass_core/output/gowinda_input/snp_files/"
## - c
## - e
## - ce
NB: These are results from a single run (not median over multiple).
Ideally, the p-value distribution should be flat around 1 and then have an excess at the low end due to selection. Our p-values are not ‘well behaved’ and do not show an excess so the chi squared distribution cannot be used as a null here.
We found that candidates were more likely to have lower coverages than expected from the background. We therefore decided to calculate FPRs within candidate bins to account for this.
FPRs are estimated using a null distribution. SNPs in the null distribution are assigned empirical p values (BF rank/total number of SNPs). The null data is then combined with the exome data and ordered by BF. The FPR for each of the exome SNP is given as the smallest empirical p value of any null SNP with a lower or equal BF.
Null XtXst: 0 1 1 4 8
Ranks: 5 4 4 2 1
p: 1 0.8 0.8 0.4 0.2
Exome XtXst:0 2 3 9 10
FDR: 1 0.8 0.8 0.2 0.2
We can see that in tied instances we use the lowest rank and that the minimum FPR possible is determined by the number of null SNPs.
This is done for each coverage bin.
## Coverage Bin Stats
## Number of Candidates
## Coverage Bin Stats
## Number of Candidates
## Coverage Bin Stats
## c
## e
## ce
## Number of Candidates
NB: Estimated for the number of null SNPs in a bin are inaccurate here. I have to estimate the number of null SNPs in each bin from FPR stats and this relies on the assumption that the highest ranking SNP is an exome SNP (which of course is unlikely the case of the core output). I have checked and this artefact is certainly introduced in this plotting stage in plot_fpr_stats_core() rather than FPR calculation in assign_fpr_core(). This explains why bin 3 always seems to have lots of null SNPs compared to the others e.g. by assuming that the top ranked exome SNP in bin 2 of westerns is actually 2 rather than 1 (i.e. there is a null SNP with a higher value), we estimate the correct number of SNPs. i.e. there are not shown to be too many SNPs in bin 3 but too little in the others.
FPR conclusions
We do not find genome-wide evidence of selection using only genetics because we do not have an excess of highly differentiated SNPs in the exomes compared to the null (non-genic regions of chr21).
We find less SNPs with high allele frequency differentiation in the exomes than in the null likely due to more purifying selection in the exomes.
Nevertheless, it is interesting to investigate the SNPs with the strongest signatures of selection in the exomes as there may still be some true signals in here.
#cand_stat='empirical_p_med_cov'
cand_stat='fpr'
## Common SNPs only
for(tail in tails){
for(subsp in subsps){
if(subsp=='all'){subsp.file=''}else{subsp.file=paste0(".", subsp)}
if(subsp=='n'){miss.pop=0}else{miss.pop=0.3}
if(cand_stat=='fpr'){suffix=paste0(".fpr", 100*tail, "pct.non-genic_",flanks,"bp.flanks")}
if(cand_stat=='empirical_p_med_cov'){suffix=paste0(".", 100*tail,'pct-cov_cor')}
# Write Gowinda input files
write.table(unique(xtx_fpr[[subsp]][xtx_fpr[[subsp]][[cand_stat]]<tail, c("chr", "pos")]),
paste0(gowinda_snp_file_dir, "/", subsp, "/", subsp, suffix),
col.names=F, row.names=F, sep="\t")
}
}
I create separate files for SNPs with higher DAF in Gashaka or KorupMtCameroon
cand_stat='fpr'
for(tail in tails){
for(subsp in 'n'){
if(subsp=='all'){subsp.file=''}else{subsp.file=paste0(".", subsp)}
if(subsp=='n'){miss.pop=0}else{miss.pop=0.3}
if(cand_stat=='fpr'){suffix=paste0(".fpr", 100*tail, "pct.non-genic_",flanks,"bp.flanks")}
if(cand_stat=='empirical_p_med_cov'){suffix=paste0(".", 100*tail,'pct-cov_cor')}
# Allele frequency stats per SNP per population
tmp=allele.freq[['n']][c('chr', 'pos', 'M_P', 'Population')]
tmp=spread(tmp, key = Population, value = M_P)
tmp$Gashaka_minus_KorupMtCameroon=tmp$n.Gashaka-tmp$n.KorupMtCameroon
tmp=merge(xtx_fpr[[subsp]], tmp[c('chr', 'pos', 'Gashaka_minus_KorupMtCameroon')], by=c('chr', 'pos'))
# Write Gowinda input files - I write them in the aux directory as what we are doing here is separating forest (KorupMtCameroon) and 'savannah' (Gashaka) candidates
write.table(unique(tmp[tmp[[cand_stat]]<tail & tmp$Gashaka_minus_KorupMtCameroon>0, c("chr", "pos")]),
paste0("../../../gowinda/baypass_aux/output/gowinda_input/snp_files/f_over_sum_known_trees/n.f_over_sum_known_trees-f_over_sum_known_trees.fpr", 100*tail, "pct.non-genic_1000bp.flanks.neg_beta"),
col.names=F, row.names=F, sep="\t")
write.table(unique(tmp[tmp[[cand_stat]]<tail & tmp$Gashaka_minus_KorupMtCameroon<0, c("chr", "pos")]),
paste0("../../../gowinda/baypass_aux/output/gowinda_input/snp_files/f_over_sum_known_trees/n.f_over_sum_known_trees-f_over_sum_known_trees.fpr", 100*tail, "pct.non-genic_1000bp.flanks.pos_beta"),
col.names=F, row.names=F, sep="\t")
}
}